##
## Drew Linzer
## drew@votamatic.org
##
## Tom Clark
## tclark7@emory.edu
##
## January 6, 2014
## 
## refe-psrm-replication.R
## 
##  "Should I Use Fixed or Random Effects?"
##  Political Science Research and Methods
## 

library(lme4)
library(MASS)

plot.data <- function(dat, grayscale=FALSE, b0=NULL, b1=NULL, fitline=TRUE) {
  units <- length(table(dat$unit))
  plotcol <- ifelse(grayscale, "darkgray", dat$unit)
  plot(y~x, dat, xlim=c(-5, 5), ylim=c(-5, 5), 
  col=plotcol, xlab="", ylab="", pch=".")
  mtext("x", 1, 2.5)
  mtext("y", 2, 2.5)
  for (k in 1:units) {
    sel <- dat[dat$unit==k, ]
    if (is.null(b0) | is.null(b1)) {
      reg <- lm(y~x, sel)
      bhat <- coefficients(reg)
      segments(min(sel$x), bhat[1] + bhat[2]*min(sel$x), 
      max(sel$x), bhat[1] + bhat[2]*max(sel$x))
    } else {
      segments(min(sel$x), b0[k] + b1*min(sel$x), 
      max(sel$x), b0[k] + b1*max(sel$x))
    }
  }
  if (fitline) { abline(lm(y~x, dat), lwd=2) }
  invisible(NULL)
}

dgp <- function(truecor, nperunit, b1, x.sd, y.sd) {
  nunits <- length(nperunit)
  v <- mvrnorm(nunits, c(0, 0), matrix(c(1, truecor, truecor, 1), 2, 2))
  b0 <- v[, 1]
  xmean <- v[, 2]
  dat <- NULL
  for (j in 1:nunits) {
    x <- rnorm(nperunit[j], xmean[j], x.sd)
    y <- b0[j]  +  b1*x  +  rnorm(nperunit[j], 0, y.sd)
    dat <- rbind(dat, cbind(x, y, unit=j))
  }
  dat <- data.frame(dat)
  return(list(dat=dat, b0=b0, xmean=xmean))  
}




## Visualize DGPs for different amounts of correlation
quartz(width=12, height=4)
par(mar=c(4, 4, 2, 1), mfrow=c(1, 3), las=1)

truecor <- -0.7
samp <- dgp(truecor, rep(50, 10), b1=1, x.sd=0.4, y.sd=1)
b.pool <- round(coefficients(lm(y~x, samp$dat))[2], 1)
plot.data(samp$dat, grayscale=T, samp$b0, 1)
title(bquote("Correlation"~.(truecor)*"; pooled"~hat(beta)*":"~.(b.pool)))

truecor <- 0
samp <- dgp(truecor=0, rep(50, 10), b1=1, x.sd=0.4, y.sd=1)
b.pool <- round(coefficients(lm(y~x, samp$dat))[2], 1)
plot.data(samp$dat, grayscale=T, samp$b0, 1)
title(bquote("Correlation"~.(truecor)*"; pooled"~hat(beta)*":"~.(b.pool)))

truecor <- 0.7
samp <- dgp(truecor=0.7, rep(50, 10), b1=1, x.sd=0.4, y.sd=1)
b.pool <- round(coefficients(lm(y~x, samp$dat))[2], 1)
plot.data(samp$dat, grayscale=T, samp$b0, 1)
title(bquote("Correlation"~.(truecor)*"; pooled"~hat(beta)*":"~.(b.pool)))




## Compare standard to sluggish DGP; true within-unit best-fit lines shown
quartz(width=8, height=4)
par(mar=c(4, 4, 2, 1), mfrow=c(1, 2), las=1)

samp <- dgp(0, rep(50, 10), b1=1, x.sd=1, y.sd=1)
plot.data(samp$dat, grayscale=T, samp$b0, 1, fitline=F)
title('Standard DGP')

samp <- dgp(0, rep(50, 10), b1=1, x.sd=0.2, y.sd=1)
plot.data(samp$dat, grayscale=T, samp$b0, 1, fitline=F)
title('Sluggish DGP')




## Parameters to manipulate in simulation:
## [1] correlation between x and unit effect
## [2] number of units
## [3] sample size per unit
## [4] strength of association between x and y
## [5] how "spread apart" or dissimilar the units are, controlled by 
##     increasing or decreasing x.sd; smaller values make units more
##     dissimilar and farther apart from one another. This can be
##     "corrected" by making y.sd smaller, so there's less noise in 
##     the within-unit relationship given the amount of variation in x.
##     Making the variation in x small is akin to having "slow-moving" 
##     IVs in TSCS data.

truecor <- c(seq(0, 0.9, 0.1), 0.95)
units <- c(10, 40, 100)
Nperunit <- c(5, 20, 50)

b1 <- 1 # also try b1 = 0, 0.5, 2
x.sd <- 0.2 # sluggish
#x.sd <- 1 # standard
y.sd <- 1

coeflist.fe <- list()
coeflist.re <- list()
coeflist.pool <- list()
#coeflist.rc <- list() # for random coefficient model
corlist <- list()

nsim <- 2000 # about 2-3 hours

index <- 1
for (unitloop in units) {
  for (Nperloop in Nperunit) {
    cat("\nunits:", unitloop, "  Nperunit:", Nperloop, "  ", 
    as.character(Sys.time()), "\n")
    coefmat.fe <- NULL
    coefmat.re <- NULL
    coefmat.pool <- NULL
    #coefmat.rc <- NULL
    for (i in 1:length(truecor)) {
      cat("\t correlation:", truecor[i], "\n")
      flush.console()
      coef.fe <- NULL
      coef.re <- NULL
      coef.pool <- NULL
      #coef.rc <- NULL
      for (k in 1:nsim) {
        # create simulated data set
        draw <- dgp(truecor[i], rep(Nperloop, unitloop), b1, x.sd, y.sd)
        ## or make unbalanced panels
        #sampsize <- round(rnorm(unitloop, Nperloop, Nperloop/2))
        #sampsize[sampsize < 2] <- 2
        #draw <- dgp(truecor[i], sampsize, b1, x.sd, y.sd)
        ## end unbalanced panels
        dat <- draw$dat
        # estimate FE, RE, and pooled models
        reg.fe <- lm(y~x + as.factor(unit)-1, dat)
        reg.re <- lmer(y~x + (1|unit), dat)
        reg.pool <- lm(y~x, dat)
        #reg.rc <- lmer(y~x + (1 + x|unit), dat)
        # save estimates of coefficients on x
        coef.fe <- c(coef.fe, coefficients(reg.fe)[1])
        coef.re <- c(coef.re, fixef(reg.re)[2])
        coef.pool <- c(coef.pool, coefficients(reg.pool)[2])
        #coef.rc <- c(coef.rc, fixef(reg.rc)[2])
      }
      coefmat.fe <- cbind(coefmat.fe, coef.fe)
      coefmat.re <- cbind(coefmat.re, coef.re)
      coefmat.pool <- cbind(coefmat.pool, coef.pool)
      #coefmat.rc <- cbind(coefmat.rc, coef.rc)
    }
    coeflist.fe[[index]] <- coefmat.fe
    coeflist.re[[index]] <- coefmat.re
    coeflist.pool[[index]] <- coefmat.pool
    #coeflist.rc[[index]] <- coefmat.rc
    index <- index + 1
  }
}

sim.result <- list(coeflist.fe = coeflist.fe, 
                   coeflist.re = coeflist.re, 
                   coeflist.pool = coeflist.pool)




## Plot RMSE of slope estimate
## FE=solid line, RE=dashed line, pooled=dotted line
quartz(width=9, height=8)
par(mfrow=c(length(units), length(Nperunit)), mar=c(2.5, 4, 2, 0.5), las=1)
index <- 1
for (unitloop in units) {
  for (Nperloop in Nperunit) {
    rmse.fe <- sqrt(colMeans((coeflist.fe[[index]] - b1)^2))
    rmse.re <- sqrt(colMeans((coeflist.re[[index]] - b1)^2))
    rmse.pool <- sqrt(colMeans((coeflist.pool[[index]] - b1)^2))
    plot(truecor, rmse.fe, pch=19, type="l", ylim=c(0, 1), xlim=c(0, 1), 
    xlab="Correlation: unit means and effects", ylab="RMSE", 
    main=paste(unitloop, "units, ", Nperloop, "observations per unit"))
    lines(truecor, rmse.re, pch=19, lty=2)
    lines(truecor, rmse.pool, pch=19, lty=3)
    index <- index + 1
  }
}




# end of file.